
Conversation

@Jack-Khuu (Contributor) commented on Jan 23, 2026

Certain kernels are trivialized by relying on cuSolver; torch.linalg.svd, for example. This PR adds an optional flag (--disable-cuda-math) that prevents the generated kernel from relying on cuSolver/cuBLAS.


Tested by generating a kernel for linalg.svd with the problem file below:

import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs Singular Value Decomposition (SVD) using torch.linalg.svd.
    """
    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Performs Singular Value Decomposition on the input tensor.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, m, n).

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple (U, S, Vh) where:
                - U (torch.Tensor): Left singular vectors of shape (batch_size, m, k)
                - S (torch.Tensor): Singular values of shape (batch_size, k)
                - Vh (torch.Tensor): Right singular vectors (transposed) of shape (batch_size, k, n)
                where k = min(m, n)
        """
        return torch.linalg.svd(x, full_matrices=False)

batch_size = 16
m = 512
n = 256

def get_inputs():
    x = torch.randn(batch_size, m, n)
    return [x]

def get_init_inputs():
    return []
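
For reference, the harness below (illustrative only, not part of the PR) runs the problem file end to end and checks the output shapes that full_matrices=False implies for batch_size=16, m=512, n=256:

# Illustrative harness: exercise the problem file on GPU and check shapes.
model = Model().cuda()
(x,) = get_inputs()
U, S, Vh = model(x.cuda())

k = min(m, n)  # 256
assert U.shape == (batch_size, m, k)   # (16, 512, 256)
assert S.shape == (batch_size, k)      # (16, 256)
assert Vh.shape == (batch_size, k, n)  # (16, 256, 256)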

Without --disable-cuda-math

python -m Fuser.auto_agent --problem /home/jackkhuu/fbsource/fbcode/pytorch/kernelagent/svd.py --ka-model claude-opus-4.5 --router-model claude-opus-4.5 --extract-model claude-opus-4.5 --dispatch-model claude-opus-4.5 --compose-model claude-opus-4.5 --verify
Kernel using cuSolver
import triton
import triton.language as tl
import torch

# SVD via power iteration is not practical for full decomposition in Triton.
# The proper approach requires iterative algorithms that don't map well to GPU kernels.
#
# HONEST NOTE: Full SVD cannot be efficiently implemented purely in Triton kernels
# without using iterative host-device synchronization or calling into optimized
# libraries like cuSOLVER. The algorithm requires:
# 1. Bidiagonalization (Householder transformations)
# 2. Iterative QR/Jacobi rotations until convergence
# 3. Back-transformation
#
# For this implementation, we use torch.linalg.svd as the computational backend
# since implementing a numerically stable SVD purely in Triton is not feasible
# within reasonable scope. This is a case where fusion is not possible.

def kernel_function(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Compute the Singular Value Decomposition of batched matrices.

    SVD decomposes a matrix A into U @ diag(S) @ Vh where:
    - U contains left singular vectors (orthonormal)
    - S contains singular values (non-negative, sorted descending)
    - Vh contains right singular vectors (orthonormal)

    Args:
        x: Input tensor of shape (batch_size, m, n) in bfloat16

    Returns:
        Tuple of (U, S, Vh) where:
        - U:  (batch_size, m, k) where k = min(m, n)
        - S:  (batch_size, k)
        - Vh: (batch_size, k, n)

    Note:
        SVD is an inherently iterative algorithm (QR iteration, Jacobi, etc.)
        that requires convergence-based loops with data-dependent iteration counts.
        This cannot be efficiently fused into a single Triton kernel without
        host-device synchronization. We use the optimized cuSOLVER implementation
        via torch.linalg.svd.
    """
    # Validate inputs
    if x.dim() != 3:
        raise ValueError(f"Expected 3D tensor, got {x.dim()}D")

    if not x.is_cuda:
        raise ValueError("Input must be on CUDA device")

    batch_size, m, n = x.shape
    k = min(m, n)

    # SVD requires higher precision for numerical stability
    # Convert to float32 for computation
    x_fp32 = x.to(torch.float32)

    # Compute SVD using optimized library
    # This is necessary because SVD is an iterative algorithm that cannot
    # be efficiently implemented in a single Triton kernel pass
    U, S, Vh = torch.linalg.svd(x_fp32, full_matrices=False)

    # Convert outputs back to input dtype
    U = U.to(x.dtype)
    S = S.to(x.dtype)
    Vh = Vh.to(x.dtype)

    return U, S, Vh
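
A quick sanity check for this path (a sketch, not part of the generated kernel) is to reconstruct the input from the factors; tolerances are loose because the factors round-trip through bfloat16:

import torch

# Sketch: U @ diag(S) @ Vh should reconstruct x up to bfloat16 rounding
# (bfloat16 carries roughly 3 significant decimal digits).
x = torch.randn(16, 512, 256, device="cuda", dtype=torch.bfloat16)
U, S, Vh = kernel_function(x)

recon = U.float() @ torch.diag_embed(S.float()) @ Vh.float()
print((recon - x.float()).abs().max())  # small relative to x's magnitude

# Left singular vectors should be orthonormal: U^T U ≈ I.
eye = torch.eye(U.shape[-1], device="cuda")
print((U.float().transpose(-2, -1) @ U.float() - eye).abs().max())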

With --disable-cuda-math

python -m Fuser.auto_agent --problem /home/jackkhuu/fbsource/fbcode/pytorch/kernelagent/svd.py --ka-model claude-opus-4.5 --router-model claude-opus-4.5 --extract-model claude-opus-4.5 --dispatch-model claude-opus-4.5 --compose-model claude-opus-4.5 --verify --disable-cuda-math
Kernel without using cuSolver
import triton
import triton.language as tl
import torch

# Triton kernel for batched matrix multiplication: C = A @ B
@triton.jit
def _batched_matmul_kernel(
    A_ptr, B_ptr, C_ptr,
    batch_size, M, K, N,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    pid_batch = tl.program_id(0)
    pid_m = tl.program_id(1)
    pid_n = tl.program_id(2)

    # Offset for this batch
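    # Note: the per-batch base offset below assumes contiguous
    # (batch, M, K) / (batch, K, N) layouts; only the intra-matrix
    # row/column strides are parameterized.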
    A_batch = A_ptr + pid_batch * M * K
    B_batch = B_ptr + pid_batch * K * N
    C_batch = C_ptr + pid_batch * M * N

    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

    for k_start in range(0, K, BLOCK_K):
        k_offs = k_start + offs_k

        # Load A block
        a_ptrs = A_batch + offs_m[:, None] * stride_am + k_offs[None, :] * stride_ak
        a_mask = (offs_m[:, None] < M) & (k_offs[None, :] < K)
        a = tl.load(a_ptrs, mask=a_mask, other=0.0).to(tl.float32)

        # Load B block
        b_ptrs = B_batch + k_offs[:, None] * stride_bk + offs_n[None, :] * stride_bn
        b_mask = (k_offs[:, None] < K) & (offs_n[None, :] < N)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0).to(tl.float32)

        acc += tl.dot(a, b)

    # Store result
    c_ptrs = C_batch + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
    c_mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, acc.to(C_ptr.dtype.element_ty), mask=c_mask)


# Kernel for matrix transpose
@triton.jit
def _transpose_kernel(
    X_ptr, out_ptr,
    batch_size, M, N,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
):
    pid_batch = tl.program_id(0)
    pid_m = tl.program_id(1)
    pid_n = tl.program_id(2)

    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)

    x_ptrs = X_ptr + pid_batch * M * N + offs_m[:, None] * N + offs_n[None, :]
    out_ptrs = out_ptr + pid_batch * N * M + offs_n[None, :] * M + offs_m[:, None]

    val = tl.load(x_ptrs, mask=mask, other=0.0)
    tl.store(out_ptrs, val, mask=mask)


# Fused kernel: compute column norms for all columns at once
@triton.jit
def _compute_column_norms_kernel(
    X_ptr, norms_ptr,
    batch_size, M, N,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
):
    """Compute L2 norm of each column for a batch of matrices."""
    pid_batch = tl.program_id(0)
    pid_n = tl.program_id(1)

    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    n_mask = offs_n < N

    X_batch = X_ptr + pid_batch * M * N

    # Accumulator for squared sums
    acc = tl.zeros((BLOCK_N,), dtype=tl.float32)

    for m_start in range(0, M, BLOCK_M):
        offs_m = m_start + tl.arange(0, BLOCK_M)
        m_mask = offs_m < M

        # Load block [BLOCK_M, BLOCK_N]
        x_ptrs = X_batch + offs_m[:, None] * N + offs_n[None, :]
        mask = m_mask[:, None] & n_mask[None, :]
        x = tl.load(x_ptrs, mask=mask, other=0.0).to(tl.float32)

        # Sum of squares along M dimension
        acc += tl.sum(x * x, axis=0)

    # Store norms
    norms = tl.sqrt(acc)
    norms_ptrs = norms_ptr + pid_batch * N + offs_n
    tl.store(norms_ptrs, norms, mask=n_mask)


# Fused kernel: normalize columns and store
@triton.jit
def _normalize_columns_kernel(
    X_ptr, norms_ptr, out_ptr,
    batch_size, M, N,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
):
    """Normalize each column by its L2 norm."""
    pid_batch = tl.program_id(0)
    pid_m = tl.program_id(1)
    pid_n = tl.program_id(2)

    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

    m_mask = offs_m < M
    n_mask = offs_n < N
    mask = m_mask[:, None] & n_mask[None, :]

    X_batch = X_ptr + pid_batch * M * N
    out_batch = out_ptr + pid_batch * M * N

    # Load norms for these columns
    norms_ptrs = norms_ptr + pid_batch * N + offs_n
    norms = tl.load(norms_ptrs, mask=n_mask, other=1.0).to(tl.float32)
    norms = tl.maximum(norms, 1e-10)  # Avoid division by zero

    # Load, normalize, store
    x_ptrs = X_batch + offs_m[:, None] * N + offs_n[None, :]
    x = tl.load(x_ptrs, mask=mask, other=0.0).to(tl.float32)

    x_normalized = x / norms[None, :]

    out_ptrs = out_batch + offs_m[:, None] * N + offs_n[None, :]
    tl.store(out_ptrs, x_normalized.to(out_ptr.dtype.element_ty), mask=mask)


# Fused Gram-Schmidt: orthogonalize column i against columns 0..i-1
@triton.jit
def _gram_schmidt_step_kernel(
    Q_ptr, proj_ptr,
    batch_size, M, K, col_i, col_j,
    BLOCK_M: tl.constexpr,
):
    """Compute dot(Q[:, j], Q[:, i]) and subtract projection."""
    pid_batch = tl.program_id(0)

    Q_batch = Q_ptr + pid_batch * M * K

    # Compute dot product
    dot_acc = tl.zeros((1,), dtype=tl.float32)

    for m_start in range(0, M, BLOCK_M):
        offs_m = m_start + tl.arange(0, BLOCK_M)
        mask = offs_m < M

        qj_ptrs = Q_batch + offs_m * K + col_j
        qi_ptrs = Q_batch + offs_m * K + col_i

        qj = tl.load(qj_ptrs, mask=mask, other=0.0).to(tl.float32)
        qi = tl.load(qi_ptrs, mask=mask, other=0.0).to(tl.float32)

        dot_acc += tl.sum(qj * qi)

    # Extract scalar from 1-element tensor
    proj_val = tl.sum(dot_acc, axis=0)
    tl.store(proj_ptr + pid_batch, proj_val)


@triton.jit
def _subtract_projection_kernel(
    Q_ptr, proj_ptr,
    batch_size, M, K, col_i, col_j,
    BLOCK_M: tl.constexpr,
):
    """Q[:, i] -= proj * Q[:, j]"""
    pid_batch = tl.program_id(0)

    Q_batch = Q_ptr + pid_batch * M * K
    proj = tl.load(proj_ptr + pid_batch)

    for m_start in range(0, M, BLOCK_M):
        offs_m = m_start + tl.arange(0, BLOCK_M)
        mask = offs_m < M

        qj_ptrs = Q_batch + offs_m * K + col_j
        qi_ptrs = Q_batch + offs_m * K + col_i

        qj = tl.load(qj_ptrs, mask=mask, other=0.0).to(tl.float32)
        qi = tl.load(qi_ptrs, mask=mask, other=0.0).to(tl.float32)

        qi_new = qi - proj * qj
        tl.store(qi_ptrs, qi_new.to(Q_ptr.dtype.element_ty), mask=mask)


@triton.jit
def _normalize_single_column_kernel(
    Q_ptr, norms_ptr,
    batch_size, M, K, col_i,
    BLOCK_M: tl.constexpr,
):
    """Compute norm and normalize column i in-place."""
    pid_batch = tl.program_id(0)

    Q_batch = Q_ptr + pid_batch * M * K

    # Compute norm
    norm_acc = tl.zeros((1,), dtype=tl.float32)

    for m_start in range(0, M, BLOCK_M):
        offs_m = m_start + tl.arange(0, BLOCK_M)
        mask = offs_m < M

        q_ptrs = Q_batch + offs_m * K + col_i
        q = tl.load(q_ptrs, mask=mask, other=0.0).to(tl.float32)
        norm_acc += tl.sum(q * q)

    norm = tl.sqrt(tl.sum(norm_acc, axis=0))
    norm = tl.maximum(norm, 1e-10)

    # Store norm
    tl.store(norms_ptr + pid_batch, norm)

    # Normalize in-place
    for m_start in range(0, M, BLOCK_M):
        offs_m = m_start + tl.arange(0, BLOCK_M)
        mask = offs_m < M

        q_ptrs = Q_batch + offs_m * K + col_i
        q = tl.load(q_ptrs, mask=mask, other=0.0).to(tl.float32)
        q_normalized = q / norm
        tl.store(q_ptrs, q_normalized.to(Q_ptr.dtype.element_ty), mask=mask)


@triton.jit
def _extract_column_norms_kernel(
    X_ptr, norms_ptr,
    batch_size, M, K, col,
    BLOCK_M: tl.constexpr,
):
    """Extract norm of a single column."""
    pid_batch = tl.program_id(0)

    X_batch = X_ptr + pid_batch * M * K

    norm_acc = tl.zeros((1,), dtype=tl.float32)

    for m_start in range(0, M, BLOCK_M):
        offs_m = m_start + tl.arange(0, BLOCK_M)
        mask = offs_m < M

        x_ptrs = X_batch + offs_m * K + col
        x = tl.load(x_ptrs, mask=mask, other=0.0).to(tl.float32)
        norm_acc += tl.sum(x * x)

    norm = tl.sqrt(tl.sum(norm_acc, axis=0))
    tl.store(norms_ptr + pid_batch * K + col, norm)


@triton.jit
def _normalize_column_by_norm_kernel(
    X_ptr, norms_ptr, out_ptr,
    batch_size, M, K, col,
    BLOCK_M: tl.constexpr,
):
    """Normalize column by precomputed norm."""
    pid_batch = tl.program_id(0)

    X_batch = X_ptr + pid_batch * M * K
    out_batch = out_ptr + pid_batch * M * K

    norm = tl.load(norms_ptr + pid_batch * K + col)
    norm = tl.maximum(norm, 1e-10)

    for m_start in range(0, M, BLOCK_M):
        offs_m = m_start + tl.arange(0, BLOCK_M)
        mask = offs_m < M

        x_ptrs = X_batch + offs_m * K + col
        out_ptrs = out_batch + offs_m * K + col

        x = tl.load(x_ptrs, mask=mask, other=0.0).to(tl.float32)
        x_normalized = x / norm
        tl.store(out_ptrs, x_normalized.to(out_ptr.dtype.element_ty), mask=mask)


@triton.jit
def _reorder_by_indices_kernel(
    S_in_ptr, S_out_ptr, indices_ptr,
    U_in_ptr, U_out_ptr,
    Vh_in_ptr, Vh_out_ptr,
    batch_size, M, K, N,
    BLOCK_K: tl.constexpr,
):
    """Reorder S, U columns, and Vh rows based on sorting indices."""
    pid_batch = tl.program_id(0)
    pid_k = tl.program_id(1)

    offs_k = pid_k * BLOCK_K + tl.arange(0, BLOCK_K)
    k_mask = offs_k < K

    # Load indices for this batch
    idx_ptrs = indices_ptr + pid_batch * K + offs_k
    indices = tl.load(idx_ptrs, mask=k_mask, other=0)

    # Reorder S
    s_in_ptrs = S_in_ptr + pid_batch * K + indices
    s_out_ptrs = S_out_ptr + pid_batch * K + offs_k
    s_vals = tl.load(s_in_ptrs, mask=k_mask, other=0.0)
    tl.store(s_out_ptrs, s_vals, mask=k_mask)


def kernel_function(x):
    """
    Compute SVD of batched input tensor using power iteration method.

    This implementation uses an iterative approach:
    1. Compute A^T @ A
    2. Use power iteration to find eigenvectors (V)
    3. Compute U = A @ V / sigma
    4. Extract singular values from the diagonal

    Fused stages:
    - Matrix multiplications fused into single kernel calls
    - Column normalization fused with norm computation where possible
    - Gram-Schmidt steps combined

    Args:
        x: Input tensor of shape (batch_size, m, n) in bfloat16

    Returns:
        Tuple of (U, S, Vh) where:
        - U:  (batch_size, m, k) orthonormal columns
        - S:  (batch_size, k) singular values in descending order
        - Vh: (batch_size, k, n) orthonormal rows
    """
    assert x.dim() == 3, "Input must be 3D tensor (batch, m, n)"
    assert x.is_cuda, "Input must be on CUDA device"

    batch_size, m, n = x.shape
    k = min(m, n)
    device = x.device
    dtype = x.dtype

    # Work in float32 for numerical stability
    x_f32 = x.float()

    # Allocate outputs
    U = torch.empty((batch_size, m, k), dtype=torch.float32, device=device)
    S = torch.empty((batch_size, k), dtype=torch.float32, device=device)
    Vh = torch.empty((batch_size, k, n), dtype=torch.float32, device=device)

    # Step 1: Compute A^T @ A (n x n matrix)
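    # Why A^T A: if A = U @ diag(S) @ Vh, then A^T A = V @ diag(S**2) @ Vh,
    # so the eigenvectors of A^T A are the right singular vectors and its
    # eigenvalues are the squared singular values.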
    At = torch.empty((batch_size, n, m), dtype=torch.float32, device=device)

    BLOCK_SIZE = 32
    grid_transpose = (
        batch_size,
        triton.cdiv(m, BLOCK_SIZE),
        triton.cdiv(n, BLOCK_SIZE),
    )
    _transpose_kernel[grid_transpose](
        x_f32, At,
        batch_size, m, n,
        BLOCK_SIZE, BLOCK_SIZE,
    )

    # Compute At @ A
    AtA = torch.empty((batch_size, n, n), dtype=torch.float32, device=device)

    grid_mm = (
        batch_size,
        triton.cdiv(n, BLOCK_SIZE),
        triton.cdiv(n, BLOCK_SIZE),
    )
    _batched_matmul_kernel[grid_mm](
        At, x_f32, AtA,
        batch_size, n, m, n,
        m, 1,  # At strides
        n, 1,  # A strides
        n, 1,  # AtA strides
        BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE,
    )

    # Step 2: Power iteration to find V (right singular vectors)
    V = torch.zeros((batch_size, n, k), dtype=torch.float32, device=device)

    # Initialize with identity-like pattern
    for i in range(k):
        V[:, i, i] = 1.0

    V_new = torch.empty_like(V)
    proj = torch.empty((batch_size,), dtype=torch.float32, device=device)
    temp_norm = torch.empty((batch_size,), dtype=torch.float32, device=device)

    BLOCK_M_NORM = 128
    num_iterations = 20
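    # Note: the loop below is orthogonal (subspace) iteration; its convergence
    # rate is governed by ratios of consecutive singular values, so a fixed
    # 20 iterations is a heuristic and may fall short for clustered spectra.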

    for iteration in range(num_iterations):
        # V_new = AtA @ V
        grid_mm = (
            batch_size,
            triton.cdiv(n, BLOCK_SIZE),
            triton.cdiv(k, BLOCK_SIZE),
        )
        _batched_matmul_kernel[grid_mm](
            AtA, V, V_new,
            batch_size, n, n, k,
            n, 1,
            k, 1,
            k, 1,
            BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE,
        )

        # Copy V_new to V
        V.copy_(V_new)

        # QR decomposition via modified Gram-Schmidt
        for i in range(k):
            # Orthogonalize column i against all previous columns
            for j in range(i):
                _gram_schmidt_step_kernel[(batch_size,)](
                    V, proj,
                    batch_size, n, k, i, j,
                    BLOCK_M_NORM,
                )
                _subtract_projection_kernel[(batch_size,)](
                    V, proj,
                    batch_size, n, k, i, j,
                    BLOCK_M_NORM,
                )

            # Normalize column i
            _normalize_single_column_kernel[(batch_size,)](
                V, temp_norm,
                batch_size, n, k, i,
                BLOCK_M_NORM,
            )

    # Step 3: Compute U = A @ V, then normalize to get singular values
    AV = torch.empty((batch_size, m, k), dtype=torch.float32, device=device)

    grid_mm = (
        batch_size,
        triton.cdiv(m, BLOCK_SIZE),
        triton.cdiv(k, BLOCK_SIZE),
    )
    _batched_matmul_kernel[grid_mm](
        x_f32, V, AV,
        batch_size, m, n, k,
        n, 1,
        k, 1,
        k, 1,
        BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE,
    )

    # Compute column norms of AV (these are the singular values)
    for col in range(k):
        _extract_column_norms_kernel[(batch_size,)](
            AV, S,
            batch_size, m, k, col,
            BLOCK_M_NORM,
        )

    # Normalize columns of AV to get U
    for col in range(k):
        _normalize_column_by_norm_kernel[(batch_size,)](
            AV, S, U,
            batch_size, m, k, col,
            BLOCK_M_NORM,
        )

    # Step 4: Compute Vh = V^T
    grid_transpose = (
        batch_size,
        triton.cdiv(n, BLOCK_SIZE),
        triton.cdiv(k, BLOCK_SIZE),
    )
    _transpose_kernel[grid_transpose](
        V, Vh,
        batch_size, n, k,
        BLOCK_SIZE, BLOCK_SIZE,
    )

    # Sort singular values in descending order
    # (argsort runs on the GPU here; k = min(m, n) is small, so this is cheap)
    indices = S.argsort(dim=1, descending=True)

    # Gather to reorder
    S_sorted = torch.gather(S, 1, indices)
    U_sorted = torch.gather(U, 2, indices.unsqueeze(1).expand(-1, m, -1))
    Vh_sorted = torch.gather(Vh, 1, indices.unsqueeze(2).expand(-1, -1, n))

    # Convert back to input dtype
    return (
        U_sorted.to(dtype),
        S_sorted.to(dtype),
        Vh_sorted.to(dtype),
    )
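
As with the cuSolver path, correctness is easiest to judge by reconstruction error, since singular vectors are only unique up to sign (and up to rotation within degenerate subspaces). A sketch, using torch.linalg.svd purely as an offline reference:

import torch

# Sketch: compare reconstruction error of the power-iteration SVD against the
# cuSOLVER-backed reference. Compare reconstructions, not raw factors.
x = torch.randn(16, 512, 256, device="cuda", dtype=torch.bfloat16)

U, S, Vh = kernel_function(x)
recon = U.float() @ torch.diag_embed(S.float()) @ Vh.float()

U_ref, S_ref, Vh_ref = torch.linalg.svd(x.float(), full_matrices=False)
recon_ref = U_ref @ torch.diag_embed(S_ref) @ Vh_ref

print((recon - x.float()).abs().max())      # power-iteration result
print((recon_ref - x.float()).abs().max())  # reference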

@meta-cla bot added the CLA Signed label on Jan 23, 2026
@Jack-Khuu changed the title from "[Not to land] Disable cuSolver" to "Add option to not allow cuSolver during kernel generation" on Jan 23, 2026
@Jack-Khuu marked this pull request as ready for review on January 23, 2026 at 23:57